%load_ext autoreload
%autoreload 2
%matplotlib inline
# %config IPCompleter.use_jedi = False
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
import networkx as nx
import copy
import sys
from glob import glob
import os
import pandas as pd
from scipy.stats import norm
import numpy as np
import matplotlib.pyplot as plt
import math
from pathlib import Path
import seaborn as sns
from IPython.display import display
import joblib
from tqdm import tqdm_notebook
import warnings
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'png'
plt.rcParams['pdf.fonttype'] = 'truetype'
plt.rcParams['svg.fonttype'] = 'none'
plt.rcParams['figure.dpi'] = 120
from portraits.utils import GeneSet, read_gene_sets, median_scale, ssgsea_formula, pivot_vectors
from portraits.utils import read_dataset, to_common_samples, item_series, cut_clustermap_tree
from portraits.clustering import gen_graph2
from portraits.plotting import clustering_heatmap, pca_plot, lin_colors, axis_net, patch_plot, draw_graph
default_cmap = matplotlib.cm.coolwarm
default_r_cmap = matplotlib.cm.coolwarm_r
single_color_cmap = sns.cubehelix_palette(as_cmap=True, light=0.97)
red_color = '#b40426'
l_red_color = '#f7607d'
blue_color = '#3b4cc0'
l_blue_color = '#8190f4'
green_color = '#229954'
orange_color = '#F0440D'
taupe_color = '#F8C471'
lgrey_color = '#AAAAAA'
dgrey_color = '#ccccff'
dblue_color = '#0000FF'
dl_blue_color = '#064B85'
purple_color = '#9933ff'
cyan_color = '#11D4FA'
fangipani_color = '#FAD7A0'
pink_color = '#F05EA0'
yellow_color = '#e2db00'
black_color = '#000000'
white_color = '#ffffff'
sns.set_style('white')
basedir = '.'
immuno_gmt = read_gene_sets(basedir + 'signatures.gmt')
len(immuno_gmt)
signatures_order = ['Angiogenesis',
'Endothelium',
'CAF',
'Matrix',
'Matrix_remodeling',
'Protumor_cytokines',
'Neutrophil_signature',
'Granulocyte_traffic',
'Macrophages',
'Macrophage_DC_traffic',
'MDSC_traffic',
'MDSC',
'Th2_signature',
'T_reg_traffic',
'Treg',
'M1_signatures',
'MHCII',
'Antitumor_cytokines',
'Coactivation_molecules',
'B_cells',
'NK_cells',
'Checkpoint_inhibition',
'Effector_cells',
'T_cells',
'Th1_signature',
'T_cell_traffic',
'MHCI',
'EMT_signature',
'Proliferation_rate']
pan_ann = read_dataset(basedir + '/pan_ann.tsv').dropna(subset=['Transcriptomics'])
display(pan_ann.shape)
# pan_ann = pan_ann[pan_ann.QC.isna()]
pan_ann.shape
dm_datasets = list(pan_ann.Cohort.value_counts().index)
len(dm_datasets)
exp_path = basedir + '/{}/expressions.tsv.gz'
read gene expression + simple distribution QC plot
Organize all datasets into folder structure like that
-Cohort1
--------expressions.tsv.gz
-Cohort2
--------expressions.tsv.gz
dm_genes_dst = {}
for cds in tqdm_notebook(dm_datasets):
cann = pan_ann[pan_ann.Cohort==cds]
cgenes = read_dataset(exp_path.format(cds)).T
cann, cgenes = to_common_samples([cann, cgenes])
if cgenes.mean().max()>20:
cgenes = np.log2(1+cgenes)
ax = sns.distplot(cgenes.mean())
ax.set_title(cds)
plt.show()
dm_genes_dst[cds] = cgenes
print(cds, cann.shape, cgenes.shape)
OK_cohorts = ['TCGA-SKCM',
'Cirenajwis',
'Budden',
'Liu',
'Gide',
'Badal',
'Ulloa-Montoya',
'Augustine',
'Bogunovic',
'Lauss',
'Nathanson']
qc_ok_p = {True: orange_color, False: l_blue_color}
patch_plot({'Not OK': orange_color, 'OK': l_blue_color})
for cds in tqdm_notebook(OK_cohorts):
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1, title=cds, title_y = 1.05)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color}, ax=next(af), title='Genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color}, ax=next(af), title='Signatures')
plt.tight_layout()
plt.show()
clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(),
xl=False, yl=False,
title=cds)
plt.show()
cds = 'Riaz'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
SRR5088893 excluded due to low correlation with the rest samples
cds = 'Hao'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
GSM1056170, GSM1056172, GSM1056175 excluded due to low correlation and being PCA outliers in the signature space
cds = 'Kunz'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
SRR6916944 excluded due to low correlation and being PCA outliers in all genes space
cds = 'Raskin'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
GSM390267 excluded due to low correlation with the rest samples and PCA outlier
cds = 'Khan'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
SRR9097552 excluded due to bad phred scores
SRR9097556, SRR9097554 excluded due to low correlation with the rest samples and PCA outlier
AJCC_1 and AJCC_2 were recombined from cohorts GSE54467 and GSE80435 by platform
AJCC_1 - GPL6884; AJCC_2 - GPL10558_2
In AJCC_1 samples from GSE80435 have slight tehnical batch from GSE54467 in the genes space. There is no batch in the signature space. So we decided to keep all the samples
cds = 'AJCC_1 + AJCC_2'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort.isin(['AJCC_1', 'AJCC_2'])],
pd.concat([dm_genes_dst['AJCC_1'], dm_genes_dst['AJCC_2']])])
cgenes.apply(lambda x: x.isna().any(None)).value_counts()
2700 genes are missing on the GPL6884 platform
cgenes
cgenes = cgenes.dropna(axis=1)
cgenes.shape
Signatures will be calculated by platform
csigns = pd.concat([ssgsea_formula(dm_genes_dst[i], immuno_gmt) for i in ['AJCC_1', 'AJCC_2']])
csigns.shape
gse_p = {'GSE54467': blue_color,
'GSE80435': orange_color}
patch_plot(gse_p)
coh_p = {'AJCC_1': green_color,
'AJCC_2': purple_color}
patch_plot(coh_p)
g = clustering_heatmap(cgenes, col_colors=pd.concat([cann.series_id.map(gse_p),
cann.Cohort.map(coh_p)], axis=1),
xl=False, yl=False, title=f'{cds} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=pd.concat([cann.series_id.map(gse_p),
cann.Cohort.map(coh_p)], axis=1),
xl=False, yl=False, title=f'{cds} - signatures')
plt.show()
af = axis_net(2, 1)
pca_plot(cgenes, cann.Cohort, palette=coh_p, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.Cohort, palette=coh_p, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
af = axis_net(2, 1)
pca_plot(cgenes, cann.series_id, palette=gse_p, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.series_id, palette=gse_p, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
First we split the cohorts by platform. Despite the fact they do not batch on the PCA plot in the signatures' space, they were procecssed as separate batches.
We additionally checked the 6 samples on the different platform from GSE80435 that were assigned to AJCC_1. They had good correlation with all other samples on the same platform. That supports that the batch in gene expression is technical but not so big. We found the batch effect succefully corrected in the signature space. That's why they were processed with the rest AJCC_1 samples
cds = 'AJCC_1'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
csigns = ssgsea_formula(cgenes, immuno_gmt)
g = clustering_heatmap(cgenes, col_colors=cann.series_id.map(gse_p),
xl=False, yl=False, title=f'{cds} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=cann.series_id.map(gse_p),
xl=False, yl=False, title=f'{cds} - signatures')
plt.show()
af = axis_net(2, 1)
pca_plot(cgenes, cann.series_id, palette=gse_p, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.series_id, palette=gse_p, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
cds = 'AJCC_2'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
cds = 'Jonsson'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
csigns = ssgsea_formula(cgenes, immuno_gmt)
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False, title=f'{cds} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False, title=f'{cds} - signatures')
plt.show()
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
Overall correlation in the all genes space is relatively low. But it is OK for illumina arrays. And all the samples are uniformly correlate with each other. We kept all the samples.
GSM550977 a little bit outliers in signature space, but it still has a good correlation with a number of samples. We decided to keep it.
cds = 'Xu'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
csigns = ssgsea_formula(cgenes, immuno_gmt)
cann.Site.value_counts()
site_p = {'Metastasis': green_color,
'Primary': '#A569BD'}
patch_plot(site_p)
g = clustering_heatmap(cgenes, col_colors=pd.concat([(~cann.QC.isna()).map(qc_ok_p),
cann.Site.map(site_p)], axis=1),
xl=False, yl=False, title=f'{cds} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=pd.concat([(~cann.QC.isna()).map(qc_ok_p),
cann.Site.map(site_p)], axis=1),
xl=False, yl=False, title=f'{cds} - signatures')
plt.show()
All samples correlation in all genes is >80%. There is a biolocig batch primary vs metastatic. It is OK.
There are no samples with low correlation with the rest. We additionally investigated a group of samples (N=9) in signature scores.
PCA Colored by Site
af = axis_net(2, 1)
pca_plot(cgenes, cann.Site, palette=site_p, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.Site, palette=site_p, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
There is an outlier group of metastatic siamples (N=9) in the signature space
# get the group of samples from the signatures clustering above
cut = cut_clustermap_tree(g, n_clusters=3).map({3: 'Group', 1: 'Others', 2: 'Others'})
cut.value_counts()
af = axis_net(2, 1, x_len=5)
pca_plot(cgenes, cut, ax=next(af), title=f'{cds} - genes', legend=False)
pca_plot(csigns, cut, ax=next(af), title=f'{cds} - signatures', legend='out')
plt.tight_layout()
plt.show()
PCA Colored by Location
af = axis_net(2, 1, x_len=5)
pca_plot(cgenes, pan_ann.Location[cgenes.index], ax=next(af), title=f'{cds} - genes', legend=False)
pca_plot(csigns, pan_ann.Location[csigns.index], ax=next(af), title=f'{cds} - signatures', legend='out')
plt.tight_layout()
plt.show()
ax = pivot_vectors(cut, pan_ann.Location).T.plot(kind='bar', stacked=True)
ax.set_title('Locations in the Group')
from scipy.stats import mannwhitneyu
pval = mannwhitneyu(pan_ann.Inflammatory_cells[pan_ann.index & cut[cut=='Group'].index].dropna(),
pan_ann.Inflammatory_cells[pan_ann.index & cut[cut=='Others'].index].dropna(),
alternative='two-sided')[1]
ax = sns.boxplot(y=pan_ann.Inflammatory_cells, x=cut)
sns.swarmplot(y=pan_ann.Inflammatory_cells, x=cut, ax=ax, color='.25')
ax.set_title(f'Pvalue: {pval:.2}')
plt.show()
The outlier group is enriched with LN metastasis location and enriched with inflammatory cells. It is again biologic batch rather than technical.
We kept all the samples
cds = 'Auslander'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1, x_len=5)
pca_plot(cgenes, cann.Patient, ax=next(af), title=f'{cds} - genes', legend=False)
pca_plot(csigns, cann.Patient, ax=next(af), title=f'{cds} - signatures', legend='out')
plt.tight_layout()
plt.show()
All samples consist of 2 groups.
All patients have multiple samples including multi-regional, technical replicates, longitudinal.
Overally the cohort is hard to work with. The dataset won't be used for any correalations. So it will be kept as is.
cds = 'Liang'
sm_p = {'FFPE': '#2471A3',
'FF': '#D35400'}
patch_plot(sm_p)
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=pd.concat([(~cann.QC.isna()).map(qc_ok_p),
cann.Storage_method.map(sm_p)], axis=1),
xl=False, yl=False,
title=cds)
plt.show()
pca_plot(cgenes, cann.Storage_method, palette=sm_p, title=f'{cds} - genes', legend='out')
plt.show()
The cohort consists of samples sequenced from FFPE using Total RNAseq or from FF using Poly-A. We split the cohort by the protocol.
Hugo + Garcia-Diaz (samples were generated in 1 lab)
cds = 'Hugo'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], pd.concat([dm_genes_dst[cds], cgenes.loc[['Pt28']]])])
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=True, title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1, x_len=5)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color, 'PCA': red_color, 'Duplicate': orange_color},
order=['OK', 'PCA', 'Duplicate'], ax=next(af), title=f'{cds} - genes', legend=False)
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color, 'PCA': red_color, 'Duplicate': orange_color},
order=['OK', 'PCA', 'Duplicate'], ax=next(af), title=f'{cds} - signatures', legend='out')
plt.tight_layout()
plt.show()
Pt28 was excluded due to low correlation with the overall mass
Some dots are colored in orange - they hide other black ones. They have near 100% correaltion. The samples are re-sequenced or re-submitted samples. They match HLA and are 100% concordant by conpair algorithm.
# from bioreactor.graphs import gen_graph2, draw_graph
g = gen_graph2(cgenes.T.corr(), .96)
draw_graph(g, e_labels=False)
for cc in nx.connected_components(g):
print(sorted(list(cc)))
Samples from Garcia-Diaz overlap with Hugo
Pt23, SRR5343925, SRR5343926
Pt5, SRR5343917, SRR5343918
Pt12, SRR5343923, SRR5343924
Pt15, SRR5343919, SRR5343920
Pt10, SRR5343921, SRR5343922
Let's raise the similarity threshold to get the duplicate samples
g = gen_graph2(cgenes.T.corr(), .99)
draw_graph(g, e_labels=False)
for cc in nx.connected_components(g):
print(sorted(list(cc)))
Some samples are just copies (re-sequenced or re-deposited)
Pt5 = SRR5343917
Pt12= SRR5343924
Pt23= SRR5343925
Pt15= SRR5343919
Pt10= SRR5343921
potential label/time mix-up SRR5343923 - pre; SRR5343924 - post
This did not affect any downstream conclustions. The patient did not change the MFP label (pre/post = F)
SRR5343917, SRR5343924, SRR5343925, SRR5343919, SRR5343921 samples excluded as duplicates
cds = 'VanAllen'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=True, title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'Coverage': orange_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'Coverage': orange_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
pat41 excluded due to low coverage
pat02 excluded because it is PCA outlier
pan_ann_f = pan_ann[pan_ann.QC.isna()]
pan_ann.shape, pan_ann_f.shape
pan_ann_f.platform_id.value_counts()
RNAseqBG - recalculated from raw fastq files using XENA pipeline. Will be checked for batches and re-assignted into cohort groups
RNAseq - Liu et al. cohort downloaded from the supplements, transformed to TPM. Will form separate batch
Others - microarrays. They will form separate batches per dataset (and platform in case of AJCC)
For example here is how the process will look like on GPL570
cplatform = 'GPL570'
Here cohort = batch (cohort_group)
cann = pan_ann_f[pan_ann_f.platform_id==cplatform]
cgenes = pd.concat([dm_genes_dst[cds] for cds in cann.Cohort.unique()])
cann, cgenes = to_common_samples([cann, cgenes])
csigns = ssgsea_formula(cgenes, immuno_gmt)
# signatures scalled by cohort
csigns_sc = pd.concat([median_scale(csigns.loc[samps.index], 4) for cb, samps in cann.groupby('Cohort_group')])
coh_p = {'Augustine': '#8000ff',
'Bogunovic': '#0ca7ef',
'Hao': '#6afdc0',
'Raskin': '#e0d377',
'Ulloa-Montoya': '#ff3e1f'}
patch_plot(coh_p)
g = clustering_heatmap(cgenes, col_colors=cann.Cohort.map(coh_p),
xl=False, yl=False, title=f'{cplatform} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=cann.Cohort.map(coh_p),
xl=False, yl=False, title=f'{cplatform} - signatures')
plt.show()
g = clustering_heatmap(csigns_sc, col_colors=cann.Cohort.map(coh_p),
xl=False, yl=False, title=f'{cplatform} - scalled signatures')
plt.show()
pca_plot(cgenes, cann.Cohort, palette=coh_p, title=f'{cplatform} - genes', legend='out')
pca_plot(csigns, cann.Cohort, palette=coh_p, title=f'{cplatform} - signatures', legend='out')
pca_plot(csigns_sc, cann.Cohort, palette=coh_p, title=f'{cplatform} - scalled signatures', legend='out')
Checking batch for all the re-calculated from raw fastq cohorts
cplatform = 'RNAseqBG'
cann = pan_ann_f[pan_ann_f.platform_id==cplatform]
cgenes = pd.concat([dm_genes_dst[cds] for cds in cann.Cohort.unique()])
cann, cgenes = to_common_samples([cann, cgenes])
csigns = ssgsea_formula(cgenes, immuno_gmt)
# signatures scalled as is
csigns_sc = median_scale(csigns, 4)
coh_p = {
'Auslander': '#B6B637',
'Hugo': '#CB4335',
'Nathanson': '#F39C12',
'Riaz': '#AF601A',
'TCGA-SKCM': lgrey_color,
'Liang': '#9B59B6',
'VanAllen': '#0032FF',
'Gide': '#2471A3',
'Khan': '#3CD4EC',
'Badal': '#8BC57E',
'Kunz': '#6afdc0',}
coh_o = ['TCGA-SKCM', 'Riaz', 'Auslander', 'Hugo', 'Nathanson',
'Gide', 'VanAllen', 'Khan',
'Liang',
'Badal',
'Kunz']
patch_plot(coh_p, order=coh_o)
storage_p = {'FF': blue_color,
'FFPE': orange_color}
patch_plot(storage_p)
rnae_p = {'PolyA': green_color,
'Total': purple_color}
patch_plot(rnae_p)
Let's compare all the cohorts to the biggest one - TCGA-SKCM
af = axis_net(5, 2,)
for cc in ['Riaz',
'Auslander',
'Hugo',
'Nathanson',
'Gide',
'VanAllen',
'Khan',
'Liang',
'Badal',
'Kunz']:
sub_ann = cann.Cohort[cann.Cohort.isin(['TCGA-SKCM', cc])]
pca_plot(cgenes, sub_ann, ax=next(af), palette=coh_p, order=coh_o, title=f'TCGA + {cc} - genes', legend=False)
plt.tight_layout()
plt.show()
af = axis_net(5, 2,)
for cc in ['Riaz',
'Auslander',
'Hugo',
'Nathanson',
'Gide',
'VanAllen',
'Khan',
'Liang',
'Badal',
'Kunz']:
sub_ann = cann.Rna_Enrichment[cann.Cohort.isin(['TCGA-SKCM', cc])]
pca_plot(cgenes, sub_ann, ax=next(af), palette=rnae_p, title=f'TCGA + {cc} - genes', legend=False)
plt.tight_layout()
plt.show()
af = axis_net(5, 2,)
for cc in ['Riaz',
'Auslander',
'Hugo',
'Nathanson',
'Gide',
'VanAllen',
'Khan',
'Liang',
'Badal',
'Kunz']:
sub_ann = cann.Storage_method[cann.Cohort.isin(['TCGA-SKCM', cc])]
pca_plot(cgenes, sub_ann, ax=next(af), palette=storage_p, title=f'TCGA + {cc} - genes', legend=False)
plt.tight_layout()
plt.show()
FF+PolyA: TCGA-SKCM, Hugo, Nathanson, Kunz, Liang-FF, Riaz, Auslander
FFPE+Total: Gide, VanAllen, Khan, Liang-FFPE
FF+Total: Badal
Riaz and Auslander cohorts share the same protocol with TCGA, but have much more batch than others (Nathanson, Hugo, Kunz, Liang-FF)
cenrm = 'PolyA'
cstorem = 'FF'
cann = pan_ann_f[(pan_ann_f.platform_id==cplatform) &
(pan_ann_f.Rna_Enrichment==cenrm) &
(pan_ann_f.Storage_method==cstorem)]
cgenes = pd.concat([dm_genes_dst[cds] for cds in cann.Cohort.unique()])
cann, cgenes = to_common_samples([cann, cgenes])
csigns = ssgsea_formula(cgenes, immuno_gmt)
# signatures scalled as is
csigns_sc = median_scale(csigns, 4)
af = axis_net(3, 1, x_len=4.5)
pca_plot(cgenes, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'RNAseq, {cstorem}+{cenrm} - genes', legend=False)
pca_plot(csigns, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'RNAseq, {cstorem}+{cenrm} - signatures', legend=False)
pca_plot(csigns_sc, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'RNAseq, {cstorem}+{cenrm} - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
There is no batch effect in the signature space. We will combine all the cohorts for scaling.
cann = pan_ann_f[(pan_ann_f.platform_id=='RNAseqBG') &
(pan_ann_f.Rna_Enrichment=='Total') &
(pan_ann_f.Storage_method=='FFPE')]
cgenes = pd.concat([dm_genes_dst[cds] for cds in cann.Cohort.unique()])
cann, cgenes = to_common_samples([cann, cgenes])
csigns = ssgsea_formula(cgenes, immuno_gmt)
# signatures scalled by cohort
csigns_sc = pd.concat([median_scale(csigns.loc[samps.index], 4) for cb, samps in cann.groupby('Cohort_group')])
af = axis_net(3, 1, x_len=4.5)
pca_plot(cgenes, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - genes', legend=False)
pca_plot(csigns, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - signatures', legend=False)
pca_plot(csigns_sc, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
In the signature space all cohorts are mixing together except for Khan. Khan cohort consists of brain metastasis. So it is expected to have altered TME.
We will combine all the cohorts for scaling
cplatform = 'RNAseqBG'
cann = pan_ann_f[pan_ann_f.platform_id==cplatform]
cgenes = pd.concat([dm_genes_dst[cds] for cds in cann.Cohort.unique()])
cann, cgenes = to_common_samples([cann, cgenes])
csigns = ssgsea_formula(cgenes, immuno_gmt)
# signatures scalled by batch
csigns_sc = pd.concat([median_scale(csigns.loc[samps.index], 4) for cb, samps in cann.groupby('Cohort_group')])
g = clustering_heatmap(cgenes, col_colors=pd.concat([cann.Cohort.map(coh_p),
cann.Storage_method.map(storage_p),
cann.Rna_Enrichment.map(rnae_p)], axis=1),
xl=False, yl=False, title=f'{cplatform} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=pd.concat([cann.Cohort.map(coh_p),
cann.Storage_method.map(storage_p),
cann.Rna_Enrichment.map(rnae_p)], axis=1),
xl=False, yl=False, title=f'{cplatform} - signatures')
plt.show()
g = clustering_heatmap(csigns_sc, col_colors=pd.concat([cann.Cohort.map(coh_p),
cann.Storage_method.map(storage_p),
cann.Rna_Enrichment.map(rnae_p)], axis=1),
xl=False, yl=False, title=f'{cplatform} - scalled by batch signatures')
plt.show()
af = axis_net(3, 1, x_len=4.5)
pca_plot(cgenes, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - genes', legend=False)
pca_plot(csigns, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - signatures', legend=False)
pca_plot(csigns_sc, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
af = axis_net(3, 1, x_len=4.5)
pca_plot(cgenes, cann.Storage_method, ax=next(af), palette=storage_p, title=f'{cplatform} - genes', legend=False)
pca_plot(csigns, cann.Storage_method, ax=next(af), palette=storage_p, title=f'{cplatform} - signatures', legend=False)
pca_plot(csigns_sc, cann.Storage_method, ax=next(af), palette=storage_p, title=f'{cplatform} - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
af = axis_net(3, 1, x_len=4.5)
pca_plot(cgenes, cann.Rna_Enrichment, ax=next(af), palette=rnae_p, title=f'{cplatform} - genes', legend=False)
pca_plot(csigns, cann.Rna_Enrichment, ax=next(af), palette=rnae_p, title=f'{cplatform} - signatures', legend=False)
pca_plot(csigns_sc, cann.Rna_Enrichment, ax=next(af), palette=rnae_p, title=f'{cplatform} - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
dm_cohort_groups = list(pan_ann.Cohort_group.value_counts().index)
len(dm_cohort_groups)
Process all cohorts. Calculate signature values and median-transform by Cohort_group
dm_genes_cg_dst = {}
dm_raw_signatures_dst = {}
dm_signatures_sc_dst = {}
for cds in tqdm_notebook(dm_cohort_groups):
cann = pan_ann[(pan_ann.Cohort_group==cds) & (pan_ann.QC.isna()) & (pan_ann.Diagnosis!='Nevus')]
cgenes = pd.concat([dm_genes_dst[choh].loc[samps.index].dropna() for choh, samps in cann.groupby('Cohort')]).dropna(axis=1)
cann, cgenes = to_common_samples([cann, cgenes])
c_signs = ssgsea_formula(cgenes, immuno_gmt)[signatures_order]
c_signs_sc = median_scale(c_signs, 4)
dm_genes_cg_dst[cds] = cgenes
dm_raw_signatures_dst[cds] = c_signs
dm_signatures_sc_dst[cds] = c_signs_sc
print(cds, cann.shape, cgenes.shape, c_signs_sc.shape)
While combininig of all the genes from all the cohorts we will lose a lot of genes. But just to show the batch before normalization
all_samples_genes = pd.concat(dm_genes_cg_dst.values()).dropna(axis=1)
all_samples_genes.shape
all_samples_raw_signatures = pd.concat(dm_raw_signatures_dst.values())
all_samples_raw_signatures.shape
all_samples_sc_signatures = pd.concat(dm_signatures_sc_dst.values())
all_samples_sc_signatures.shape
coh_p = lin_colors(pan_ann.Cohort)
patch_plot(coh_p)
af = axis_net(3, 1)
pca_plot(all_samples_genes, pan_ann.Cohort, ax=next(af), palette=coh_p, title=f'All - genes', legend=False)
pca_plot(all_samples_raw_signatures, pan_ann.Cohort, ax=next(af), palette=coh_p, title=f'All - signatures', legend=False)
pca_plot(all_samples_sc_signatures, pan_ann.Cohort, ax=next(af), palette=coh_p, title=f'All - scalled signatures', legend=False)
plt.tight_layout()
plt.show()
coh_g_p = lin_colors(pan_ann.Cohort_group)
patch_plot(coh_g_p)
af = axis_net(3, 1, x_len=5.5, y_len=5)
pca_plot(all_samples_genes, pan_ann.Cohort_group, palette=coh_g_p, ax=next(af), title=f'All - genes', legend=False)
pca_plot(all_samples_raw_signatures, pan_ann.Cohort_group, palette=coh_g_p, ax=next(af), title=f'All - signatures', legend=False)
pca_plot(all_samples_sc_signatures, pan_ann.Cohort_group, palette=coh_g_p, ax=next(af), title=f'All - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
for cproc in ['CAF', 'T_cells', 'Proliferation_rate']:
us = sorted(pan_ann.Cohort_group.unique())
af = axis_net(2, len(us), x_len=2, y_len=.5, title=cproc, title_y=.95)
xlims = [all_samples_raw_signatures[cproc].min(),all_samples_raw_signatures[cproc].max()]
for cp in us:
samps = pan_ann[pan_ann.Cohort_group.astype(str)==cp]
sign = dm_raw_signatures_dst[cp][cproc]
ax = next(af)
sns.kdeplot(sign, ax=ax, shade=True, legend=False, color=coh_g_p[cp], )
ax.grid(False)
ax.patch.set_alpha(.0)
ax.yaxis.set_label_coords(-0.5, .4)
# check that all plots in the column have the same y and x limits
# ax.set_ylim(0, .5)
ax.set_xlim(*xlims)
# hide all spines and ticklabels
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_yticklabels([])
ax.set_ylabel(cp, rotation=0, )
ax.plot([sign.median()]*2, [.8*x for x in ax.get_ylim()], 'k--', color='#222222')
if cp != us[-1]:
ax.set_xticklabels([])
sign = dm_signatures_sc_dst[cp][cproc]
ax = next(af)
sns.kdeplot(sign, ax=ax, shade=True, legend=False, color=coh_g_p[cp])
ax.grid(False)
ax.patch.set_alpha(.0)
ax.yaxis.set_label_coords(-0.1, .1)
# check that all plots in the column have the same y and x limits
ax.set_ylim(0, .5)
ax.set_xlim(-4, 4)
# hide all spines and ticklabels
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_yticklabels([])
ax.plot([sign.median()]*2, [.8*x for x in ax.get_ylim()], 'k--', color='#222222')
if cp != us[-1]:
ax.set_xticklabels([])
plt.subplots_adjust(hspace=-.5, wspace=.2)